Setup

Load packages

library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA),
  text = element_text(size = 20),
  strip.placement = "outside",
  strip.text = element_text(size=20, color = "black"),
  strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(DESeq2)
library(consensusSeekeR)
library(BSgenome.jaNemVect1.1.DToL.Assembly)
library(GenomicRanges)
library(parallel)
library(universalmotif)
library(monaLisa)
library(rtracklayer)
library(ComplexUpset)
source("../motif-analysis/mta_downstream_functions.R")
source("scripts/functions.R")

Directories and colors

dat_dir <- "ATACSEQ/nucleosome_free_regions/"
pks_dir <- file.path(dat_dir, "macs2_peaks")
cns_dir <- file.path(dat_dir, "consensus_peaks")
hom_dir <- file.path(dat_dir, "homer")
ftp_dir <- file.path(dat_dir, "footprint")
bind_dir <- file.path(dat_dir, "footprint", "BINDetect")
res_dir <- file.path(dat_dir, "results")
fig_dir <- file.path(dat_dir, "plots")
for (newdir in c(cns_dir, hom_dir, res_dir, fig_dir))
  dir.create(newdir, showWarnings = FALSE)

# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
  "Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
  "Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)

Gene annotations

marks_gold <- fread(
  file.path("annotation", "golden-marks-231124.tsv"),
  fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)

tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))

gene_annt <- fread(
  file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
  select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))

gene_ann <- rbindlist(list(
  gene_annt[!gene %in% tfs_annt$gene],
  tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]

Peaks counts

Find consensus set of peaks

require(consensusSeekeR)
require(BSgenome.jaNemVect1.1.DToL.Assembly)
require(parallel)

# load peaks
pks_files <- list.files(
  pks_dir,
  pattern = "narrowPeak",
  recursive = FALSE,
  full.names = TRUE
)
names(pks_files) <- str_remove(
  basename(pks_files),
  ".mLb.clN.ncfree_peaks.narrowPeak"
)
nP_list <- lapply(names(pks_files), function(x) {
    nP <- readNarrowPeakFile(
      pks_files[x],
      extractRegions = TRUE,
      extractPeaks = TRUE
    )
    names(nP$narrowPeak) <- rep(x, length(nP$narrowPeak))
    names(nP$peak) <- rep(x, length(nP$peak))
    nP
})
regions <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$narrowPeak))
peaks <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$peak))
names(regions) <- names(pks_files)
names(peaks) <- names(pks_files)

# get consensus
chrList <- Seqinfo(
    seqnames = seqnames(BSgenome.jaNemVect1.1.DToL.Assembly),
    seqlengths = seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly),
    isCircular = c(
      rep(FALSE, length(seqnames(BSgenome.jaNemVect1.1.DToL.Assembly)) - 1),
      TRUE
    ),
    genome = "jaNemVect1.1"
)
message(Sys.time(), " Started calculating consensus")
ur <- unlist(regions)
up <- unlist(peaks)

results <- findConsensusPeakRegions(
    narrowPeaks = ur,
    peaks = up,
    chrInfo = chrList,
    extendingSize = 250,
    expandToFitPeakRegion = FALSE,
    shrinkToFitPeakRegion = FALSE,
    minNbrExp = 2,
    nbrThreads = detectCores() - 1
)
message(Sys.time(), " Done calculating consensus")
saveRDS(results, file.path(cns_dir, "consensusSeekeR-results.RDS"))
saveRDS(results, file.path(cns_dir, "consensusSeekeR-results.RDS"))

# resize peaks
pks_cns <- results$consensusRanges
pks_mid <- start(pks_cns) + (end(pks_cns) - start(pks_cns)) / 2
ranges(pks_cns) <- IRanges(pks_mid, width = 0)
pks_scl <- promoters(pks_cns, upstream = 125, downstream = 125)

# trim out-of-bound peaks
seqlengths(pks_scl) <- seqlengths(
  BSgenome.jaNemVect1.1.DToL.Assembly
)[names(seqlengths(pks_scl))]
pks_scl <- trim(pks_scl)

# save bed file
pks_bed <- as.data.table(pks_scl)
pks_bed[, name := paste0("peak", seq_len(.N))]
pks_bed[, width := NULL][, score := "."]
setcolorder(pks_bed, c("seqnames", "start", "end", "name", "score", "strand"))
fwrite(
  pks_bed,
  file.path(cns_dir, "consensusSeekeR-peaks.bed"),
  sep = "\t",
  col.names = FALSE
)

Get scores for consensus peaks in all samples.

nth=12
res="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks-counts.tsv"
bed="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
bam=$( echo ATACSEQ/nucleosome_free_regions/bam/*R*bam )

cols=$( echo -e seqnames start end name score strand | column -t )
for b in $bam
do 
  echo $b
  name=$( basename $b)
  cols=$( echo -e ${cols} ${name} | column -t )
done

echo -e ${cols} > ${res%%tsv}txt
bedtools multicov -bams ${bam} -bed ${bed} > ${res}

Data for differential peaks analysis

# load counts in peaks
pks_ct <- fread(
  file.path(cns_dir, "consensusSeekeR-peaks-counts.tsv"),
  header = FALSE
)
colnames <- readLines(
  file.path(cns_dir, "consensusSeekeR-peaks-counts.txt"),
  n = 1
)
colnames <- str_remove_all(colnames, ".mLb.clN.ncfree.sorted.bam")
colnames <- str_split(colnames, " ")[[1]]
colnames(pks_ct) <- colnames

# column data
col_dt <- data.table(sample = colnames(pks_ct)[7:28])
col_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
col_dt[, reporterline := factor(reporterline, levels=c("Elav", "Fox", "Ncol"))]
col_dt[, condition := str_extract(sample, "pos|neg")]
col_dt[, condition := str_replace_all(
  condition,
  c("pos" = "positive", "neg" = "negative")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
col_dt[, group := paste(
  as.character(reporterline), as.character(condition), sep = ""
), by = seq_len(nrow(col_dt))]
col_dt[, group := factor(group)]
fwrite(col_dt, file.path(res_dir, "design.tsv"), sep = "\t")

col_df <- copy(col_dt)
class(col_df) <- "data.frame"
rownames(col_df) <- col_df$sample

# counts matrix
pks_mt <- as.matrix(pks_ct[, -c(1:6)])
rownames(pks_mt) <- pks_ct$name
pks_mt <- pks_mt[, rownames(col_df)]
write.table(
  pks_mt,
  file.path(res_dir, "mat.tsv"),
  sep = "\t",
  quote = FALSE
)

# DESeq2
require(DESeq2)
dds <- DESeqDataSetFromMatrix(
  countData = pks_mt,
  colData = col_df,
  design = ~ condition + reporterline
)
saveRDS(dds, file.path(res_dir, "dds.rds"))

Peaks counts distributions

# normalized accessibility distribution
pks_dt <- as.data.table(pks_mt, keep.rownames = "peak")
pks_dt <- melt.data.table(
  pks_dt,
  id.vars = "peak",
  variable.name = "sample",
  value.name = "norm_counts"
)
pks_dt <- merge.data.table(pks_dt, col_dt, by = "sample")
setorder(pks_dt, peak, condition, reporterline)
pks_dt[, sample := factor(sample, levels = unique(pks_dt$sample))]

gp_acc <- ggplot(pks_dt, aes(sample, log10(norm_counts), fill = reporterline)) +
  geom_violin(scale = "width", alpha = 0.8, color = "black") +
  geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.8, color = "black") +
  scale_fill_manual(values = line_cols, limits = force) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(x = "samples", y = "peak\naccessibility") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
    legend.title = element_blank(), legend.position = "none"
  )

var_dt <- pks_dt[, .(var = var(norm_counts)), peak]
gp_var <- ggplot(var_dt, aes(log10(var))) +
  geom_density() +
  scale_x_continuous(limits = c(-4, NA)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(x = "log10(accessibility variance)")

gp_pch <- gp_acc / gp_var + plot_layout(heights = c(2, 1))
gp_pch

Use normalized log-transformed accessibility data

# normalize samples
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- estimateSizeFactors(dds)
norm_mt <- counts(dds, normalized = TRUE)

# save
write.table(
  norm_mt,
  file.path(res_dir, "mat_norm.tsv"),
  sep = "\t",
  row.names = TRUE,
  quote = FALSE
)

# row normalize to bring peaks to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median)
norm_mt <- log2(norm_mt)

# save
write.table(
  norm_mt,
  file.path(res_dir, "mat_norm_fc.tsv"),
  sep = "\t",
  row.names = TRUE,
  quote = FALSE
)

PCA

PCA on all samples.

set.seed(1950)
pca_res <- prcomp(t(norm_mt), center = TRUE)

# variance explained
pca_var <- data.table(
  pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2)
)
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
  geom_bar(stat = "identity") +
  geom_line(aes(y = pct_cum, group = 1)) +
  geom_point(aes(y = pct_cum)) +
  scale_y_continuous(expand = expansion(0.01,0)) +
  labs(y = "% of variance\nexplained", x = "PC") +
  theme(panel.grid.major.y = element_line(size = 0.5))

pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=reporterline, shape=condition)) +
  geom_point(size = 5) +
  scale_fill_manual(values = line_cols) +
  scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
  guides(fill = guide_legend(override.aes=list(shape=21))) +
  geom_text_repel(aes(label = sample))

gp_pch <- gp_var / gp_bip
gp_pch

PCA per cell line

set.seed(1950)

gp_l <- lapply(names(line_cols), function(cl) {

  pca_res <- prcomp(t(norm_mt[,grep(cl,colnames(norm_mt))]), center = TRUE)

  # variance explained
  pca_var <- data.table(
    pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2)
  )
  pca_var <- pca_var[-nrow(pca_var)]
  pca_var[,pct_cum:=cumsum(pct_var)]
  pca_var[,PC:=factor(1:.N - 1)]
  gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
    geom_bar(stat = "identity") +
    geom_line(aes(y = pct_cum, group = 1)) +
    geom_point(aes(y = pct_cum)) +
    scale_y_continuous(expand = expansion(0.01,0)) +
    labs(y = "% of variance\nexplained", x = "PC") +
    theme(panel.grid.major.y = element_line(size = 0.5))
  
  # pca plot
  pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
  pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
  gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=condition, shape=condition)) +
    geom_point(size=5) +
    scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
    scale_fill_manual(values = condition_cols) +
    guides(fill = guide_legend(override.aes=list(shape=21))) +
    geom_text_repel(aes(label = sample))
  
  gp_var / gp_bip 

})
gp_l
## [[1]]

## 
## [[2]]

## 
## [[3]]

Marker peaks

Use normalized peak counts

norm_mt <- read.table(
  file.path(res_dir, "mat_norm_fc.tsv"),
  header = TRUE
)

Identify marker peaks

# gene markers by high FC + significant DEseq2 LTR test 
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj 
names(dds_qval) <- rownames(dds_res)
peaks_deseq <- names(which(dds_qval<1e-2))
peaks_high <- names(which(apply(
  norm_mt, 1, function(x) sort(x, decreasing = TRUE)[2] >= 1.8
)))
peaks_marks <- intersect(peaks_high, peaks_deseq)
length(peaks_marks)
## [1] 2844

Cluster peaks

set.seed(1950)

# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks,  function(k) {
  cl <- kmeans(norm_mt[peaks_marks,], k)
  cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point()+
  scale_x_continuous(breaks = ks)
elbow_gp

Cluster peaks

# kmeans
set.seed(1950)
k <- 19
cl <- kmeans(norm_mt[peaks_marks,], k)
peaks_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
  cor_peaks <- cor(t(norm_mt[gs,]))
  hclust_peaks <- hclust(as.dist(
    1 - cor(cor_peaks)),
    method = "ward.D2"
  )
  rownames(cor_peaks)[hclust_peaks$order]
})
names(peaks_order_list) <- unique(cl$cluster)
peaks_order_list <- peaks_order_list[as.character(seq_along(peaks_order_list))]

# cluster clusters
cluster_order <- hclust(dist(
  cor(t(cl$centers)), method = "euclidean"
), method = "ward.D2")$order
peaks_order_list <- peaks_order_list[as.character(cluster_order)]
peaks_order <- unname(unlist(peaks_order_list))
clusters_dt <- data.table(
  peak = peaks_order,
  clusters = as.character(rep(
    names(peaks_order_list),
    sapply(peaks_order_list, length)
  ))
)

# group clusters (manually)
clusters_dt[clusters %in% c(16), group :=1 ] # Elav
clusters_dt[clusters %in% c(5), group :=2 ] # Elav + Fox
clusters_dt[clusters %in% c(14,17), group := 3] # Fox
clusters_dt[clusters %in% c(6,11), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(9), group := 5]
clusters_dt[clusters %in% c(13,4), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(1,2,15,7,8,10,19,18), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(3), group := 9]
setorder(clusters_dt, group)
peaks_order <- clusters_dt$peak

# add clusters info to peaks coordinates bed file
pks_bed <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(pks_bed, c("V4"), c("peak"))
clusters_bed <- merge.data.table(
  pks_bed, clusters_dt,
  by = "peak",
  sort = FALSE
)
setcolorder(clusters_bed, c(colnames(pks_bed)))
clusters_bed[, clusters := paste0("C", clusters)]
clusters_bed[, group := paste0("G", group)]
fwrite(
  clusters_bed,
  file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"),
  sep = "\t",
  col.names = FALSE
)

Heatmap of markers

# order rows and columns
samples_order <- col_dt[order(condition,reporterline)]$sample
plot_mt <- norm_mt[peaks_order,samples_order]

# center at 0
plot_min <- quantile(abs(range(plot_mt)),0.75)
plot_min <- 5
plot_mt <- pmin(pmax(plot_mt,-plot_min),plot_min)

# heatmap colors
col_vec <- colorRampPalette(
  RColorBrewer::brewer.pal(11,'BrBG')
)(1000)
col_fun <- circlize::colorRamp2(
  seq(-plot_min, plot_min, length.out = length(col_vec)),
  col_vec
)

# color annotaitons
col_ann <- HeatmapAnnotation(
    which = "column", border = TRUE,
    "reporterline" = as.character(
      col_dt[match(colnames(plot_mt),sample)]$reporterline
    ),
    "condition" = as.character(
      col_dt[match(colnames(plot_mt),sample)]$condition
    ), 
    col = list(
      "reporterline" = line_cols,
      "condition" = condition_cols
    )
)

# # peak module annotations (clusters)
# clann <- clusters_dt[match(rownames(plot_mt),peak)]$clusters
# clann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$clusters)
# clann <- factor(clann, levels=clann_lab)
# module_ann <- HeatmapAnnotation(
#     which = "row", border = TRUE,
#     "cluster" = anno_block(
#       labels = clann_lab,
#       gp = gpar(col=NA)
#     )
# )
# row_split <- clann

# peak module annotations (manually groupped clusters)
grann <- clusters_dt[match(rownames(plot_mt),peak)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$group)
grann <- factor(grann, levels=grann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "group" = anno_block(
      labels = grann_lab,
      gp = gpar(col=NA)
    )
)
row_split <- grann

# peaks annotations
row_labels_marks_ids <- match(
  clusters_dt[,.SD[1],clusters]$peak,
  rownames(plot_mt)
)
row_labels_marks <- clusters_dt[,.SD[1], clusters]$clusters
mark_ann <- HeatmapAnnotation(
  which = "row",
  marker = anno_mark(at = row_labels_marks_ids, labels = row_labels_marks),
  show_legend = FALSE
)
mark_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "padj<0.05" = rownames(plot_mt) %in% peaks_deseq,
    "var>1" = rownames(plot_mt) %in% peaks_vari,
    "FC>2" = rownames(plot_mt) %in% peaks_fc,
    col = list(
      "padj<0.05" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
      "var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
      "FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
    )
)

# heatmap
hm <- Heatmap(
  plot_mt, name = "normalized\naccessibility",
  col = col_fun,
  cluster_rows = FALSE, cluster_columns = FALSE,
  show_row_names = FALSE, show_column_names = TRUE,
  row_title = "peaks", 
  row_split = row_split, 
  cluster_row_slices = FALSE,
  top_annotation = col_ann,  
  left_annotation = module_ann, right_annotation = mark_ann,
  border = TRUE
)
hm

Peak to gene assignment

Load data

# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)

# genes
genes_gr <- rtracklayer::import(
  "annotation/Nvec_v4_merged_annotation_sort.gtf",
  "gtf"
)
genes_gr <- genes_gr[genes_gr$type == "transcript"]
genes_gr$name <- genes_gr$transcript_id

# non-expressed genes to remove
counts <- read.table(
  "RNASEQ_QUANTIFICATION/raw_counts_rnaseq.tsv",
  header = TRUE,
  row.names = 1
)
exclude_genes <- names(which(
  apply(counts, 1, function(x) sum(x) < 10)
))
length(exclude_genes) # 437
## [1] 437

Assign peaks to genes

# assign
assign <- mta_match_peaks_to_genes(
  gff_object = genes_gr,
  peak_object = peaks_gr,
  index_object = "genome/Nvec_vc1.1_gDNA.fasta.fai",
  list_genes = NULL,
  feature_to_match = "transcript",
  feature_field = "name",
  exclude_genes = NULL,
  max_tss_dist = 10000,
  min_overlap = 0,
  max_gap = 1,
  promoter_upstream = 200,
  promoter_downstream = 50,
  promoter_object = NULL
)
setDT(assign)
setnames(assign, "chr", "seqnames")
assign[, seqnames := factor(seqnames, levels = unique(peaks$seqnames))]
setorder(assign, seqnames, start, end)

# save
fwrite(
  assign,
  file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"),
  sep = "\t",
  col.names = TRUE
)

Inspect peak assignment results

assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))

max_assign_peaks_per_gene <- unique(
  assign[, .(peak, gene)][!is.na(gene)]
)[, .N, gene]
gb1 <- ggplot(max_assign_peaks_per_gene, aes(N)) +
  geom_bar() +
  labs(x = "peaks per gene", y = "genes") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  scale_x_continuous(
    breaks = c(1, seq(5, max(max_assign_peaks_per_gene$N), 5))
  )

max_assign_gene_per_peak <- unique(
  assign[, .(peak, gene)][!is.na(gene)]
)[, .N, peak]
gb2 <- ggplot(max_assign_gene_per_peak, aes(N)) +
  geom_bar() +
  labs(x = "genes per peak", y = "peaks") +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.01)),
    labels = scales::label_number(scale = 0.001, suffix = "K")
  ) +
  scale_x_continuous(breaks = seq(1, max(max_assign_peaks_per_gene$N)))

gb1 + gb2

Calculate gene accessibility scores as weighted sum of normalized peak scores assigned to genes.

assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")

genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name

peaks_mt <- read.table(file.path(res_dir, "mat_norm.tsv"))

# cells_groups <- fread(file.path(res_dir, "design.tsv"))[, .(sample, group)]
# setnames(cells_groups, "sample", "cell")

gscore <- mta_gene_scores(
  genes_peaks_table = assign,
  gff_object = genes_gff,
  peak_object = peaks_gr,
  peaks_mat = peaks_mt
)

saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores.rds"))
write.table(
  gscore$genes_scores_matrix,
  file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
  sep = "\t",
  col.names = TRUE
)

Use normalized gene scores

# load gene scores
norm_mt <- read.table(
  file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
  header = TRUE
)
# row normalize to bring genes to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median) 
norm_mt <- log2(norm_mt)
con_mt <- read.table(
  file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
)

# gene markers by high FC + significant DEseq2 LTR test
dds <- DESeqDataSetFromMatrix(
  countData = con_mt,
  colData = col_df,
  design = ~ condition + reporterline + condition:reporterline
)
dds <- DESeq(dds, test = "LRT", reduced = ~ 1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
genes_deseq <- names(which(dds_qval < 1e-2))

genes_high <- names(which(apply(norm_mt, 1, function (x)
  sort(x, decreasing = TRUE)[1] > 1.8
)))

gene_marks <- intersect(genes_high, genes_deseq)

# inclusde golden markers
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
gene_marks <- unique(c(gene_marks, marks_gold[gene %in% genes_deseq]$gene))

Select number of clusters for genes

set.seed(1950)

# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks,  function(k) {
  cl <- kmeans(norm_mt[gene_marks, ], k)
  cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point() +
  scale_x_continuous(breaks = ks) + 
  theme(panel.grid.major = element_line(size = 0.5))
elbow_gp

Cluster genes

set.seed(1950)

# kmeans
k <- 18
cl <- kmeans(norm_mt[gene_marks, ], k)
gene_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
  cor_genes <- cor(t(norm_mt[gs,]))
  hclust_genes <- hclust(as.dist(1 - cor(cor_genes)), method = "ward.D2")
  rownames(cor_genes)[hclust_genes$order]
})
names(gene_order_list) <- unique(cl$cluster)
gene_order_list <- gene_order_list[as.character(seq_along(gene_order_list))]

# cluster clusters
cluster_order <- hclust(
  dist(cor(t(cl$centers)),
  method = "euclidean"),
  method = "ward.D2"
)$order
gene_order_list <- gene_order_list[as.character(cluster_order)]
gene_order <- unname(unlist(gene_order_list[cluster_order]))
clusters_dt <- data.table(
  gene = unlist(gene_order_list),
  clusters = as.character(
    rep(names(gene_order_list), sapply(gene_order_list, length))
  )
)

# group clusters (manually)
clusters_dt[clusters %in% c(8, 15), group := 1] # Elav
clusters_dt[clusters %in% c(1, 4), group := 2] # Elav + Fox
clusters_dt[clusters %in% c(16, 2), group := 3] # Fox
clusters_dt[clusters %in% c(3), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(13), group := 5] # Fox + Ncol + Elav
clusters_dt[clusters %in% c(17, 5), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(7, 14, 6, 10, 11, 9), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(18), group := 9]
setorder(clusters_dt, group)
gene_order <- clusters_dt$gene

# save
fwrite(
  clusters_dt,
  file.path(res_dir, "genes_clusters.tsv"),
  sep = "\t"
)

Heatmap of markers

# order rows and columns
col_dt <- fread(file.path(res_dir, "design.tsv"))
samples_order <- col_dt[order(condition, reporterline)]$sample
plot_mt <- norm_mt[gene_order, samples_order]

# center at 0
plot_min <- quantile(abs(range(plot_mt)), 0.75)
plot_min <- 4
plot_mt <- pmin(pmax(plot_mt, -plot_min), plot_min)

# heatmap colors
col_vec <- colorRampPalette(RColorBrewer::brewer.pal(11, 'BrBG'))(1000)
col_fun <- circlize::colorRamp2(
  seq(-plot_min, plot_min, length.out = length(col_vec)),
  col_vec
)

# color annotations
col_ann <- HeatmapAnnotation(
    which = "column",
    border = TRUE,
    "reporterline" = as.character(
      col_dt[match(colnames(plot_mt), sample)]$reporterline
    ),
    "condition" = as.character(
      col_dt[match(colnames(plot_mt), sample)]$condition
    ),
    col = list("reporterline" = line_cols, "condition" = condition_cols)
)

# gene module annotations
clann <- clusters_dt[match(rownames(plot_mt), gene)]$clusters
clann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$clusters)
clann <- factor(clann, levels = clann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "cluster" = anno_block(
      labels = clann_lab,
      gp = gpar(col = NA)
    )
)
row_split <- clann

# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "group" = anno_block(
      labels = grann_lab,
      gp = gpar(col = NA)
    )
)
row_split <- grann
# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "group" = anno_block(
      labels = grann_lab,
      gp = gpar(col = NA)
    )
)
row_split <- grann

# genes annotations
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]

tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

marks_tfs <- merge.data.table(
  tfs_annt,
  marks_gold,
  by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
marks_tfs <- marks_tfs[gene %in% rownames(plot_mt)]
marks_tfs[
  gene %in% clusters_dt[group==5]$gene,
  name := str_replace_all(og, c(
    "HMGbox_Sox.HG1.25:like:BHMG1/SOX1/SOX2/SOX3/SOX14/SOX15/SOX21/SRY:likeclu:18/26/28" = "SOX1/2/3/14/15/21-like",
    "zf-C2H2.HG5.17:PRDM6" = "zf-C2H2 PRDM6",
    "zf-C2H2.Unclassified" = ""
  ))
]
marks_tfs <- marks_tfs[name != ""]

row_labels_marks_ids <- match(marks_tfs$gene, rownames(plot_mt))
row_labels_marks <- marks_tfs[
  match(rownames(plot_mt)[row_labels_marks_ids], gene)
]$name
mark_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "padj<0.01" = rownames(plot_mt) %in% genes_deseq,
    # "var>1" = rownames(plot_mt) %in% genes_vari,
    #"FC>2" = rownames(plot_mt) %in% genes_fc,
    "marker" = anno_mark(
      at = row_labels_marks_ids,
      labels = row_labels_marks
    ),
    col = list(
      "padj<0.01" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
      "var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
      "FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
    )
)

# heatmap
hm <- Heatmap(
  plot_mt, name = "normalized\naccessibility",
  col = col_fun,
  cluster_rows = FALSE, cluster_columns = FALSE,
  show_row_names = FALSE, show_column_names = TRUE,
  row_title = "genes",
  row_split = row_split,
  cluster_row_slices = FALSE,
  top_annotation = col_ann, 
  right_annotation = mark_ann,
  left_annotation = module_ann,
  border = TRUE
)
hm

Motifs

Score archetype motifs in consensus peaks

require(universalmotif)
require(monalisa)

# motifs
mots_lst <- readRDS(
  file.path("annotation", "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms.rds")
)
mona_lst <- mta_convert_umot_to_monalisa(mots_lst)

# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)

# genome
genome <- Biostrings::readDNAStringSet("genome/Nvec_vc1.1_gDNA.fasta")
seqdt <- fread("genome/Nvec_vc1.1_gDNA.fasta.fai")[,1:2]
seqlvl <- c(seqdt[[1]], "ENA|OW052000|OW052000.1")

# scanning
mta_scores_mona <- mta_gw_motif_score_monalisa(
  motifs = mona_lst,
  genome_object = genome,
  index_object = seqdt,
  bin_width = 250,
  subsample_fraction = 0.10,
  score_quantiles = c(0, 0.25, 0.5, 0.75, 0.95, 0.98, 0.99, 0.995, 0.999, 1.0),
  score_quantile_thr = 0.95,
  do_gw_scan = TRUE,
  given_gr = peaks_gr,
  nthreads = 16
)
saveRDS(
  mta_scores_mona, 
  file.path(res_dir, "motif-scores-archetypes-mona.rds")
)

# map motifs back to peaks
mta_hits <- mta_scores_mona$gw_scan
mta_cent <- narrow(mta_hits, start = width(mta_hits)/2, width = 1)
mta_ovls <- findOverlaps(query = mta_hits, subject = peaks_gr)
mta_scor <- mta_hits[queryHits(mta_ovls)]
pro_scor <- peaks_gr[subjectHits(mta_ovls)]
mcols(mta_scor) <- cbind(mcols(mta_scor), mcols(pro_scor))

# add genes
assign <- fread(
  file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)[, .(peak, gene)]
mta_data <- as.data.table(mta_scor)
mta_data[, score := NULL]
setnames(mta_data, c("name", "motif_score"), c("peak", "score"))
mta_annt <- merge.data.table(
  mta_data, assign,
  by = "peak",
  all.x = TRUE,
  allow.cartesian = TRUE
)
mta_annt[is.na(gene), gene := ""]
setcolorder(mta_annt, colnames(mta_data))

# order
mta_annt[, peak := factor(peak, levels = peaks$name)]
setkey(mta_annt, peak)
mta_annt <- unique(mta_annt)

# save
fwrite(
  mta_annt,
  file.path(res_dir, "motif-scores-archetypes-mona-annot.tsv"),
  sep = "\t"
)

Motif enrichment in groups defined before

# load peaks clustering to groups
pks <- fread(
  file.path(res_dir, "consensusSeekeR-peaks-clusters.bed")
)
setnames(pks, c("V4", "V8"), c("peak", "group"))
pks[, peak := factor(peak, levels = unique(pks$peak))]
pks[, group := factor(
  group,
  levels = paste0("G", sort(as.integer(str_remove(unique(pks$group), "G"))))
)]
setkey(pks, group, peak)

mta_scores_mona <- readRDS(
  file.path(res_dir, "motif-scores-archetypes-mona.rds")
)

sites_gr <- mta_scores_mona$gw_scan
sites_gr$name <- sites_gr$motif

enr_list <- lapply(as.character(levels(pks$group)), function(g) {

  enr_df <- mta_motif_enrichment_test(
    sites_object = sites_gr,
    peaks_object = peaks_gr,
    fg_list = peaks_gr[peaks_gr$name %in% pks[group == g]$peak]$name,
    bg_list = peaks_gr[!peaks_gr$name %in% pks[group == g]$peak]$name,
    thresholds_vector = NULL, # mta_scores_mona$score_quantiles[,"0.95"],
    label = g,
    nthreads = 2,
    pval_adjust = "BH"
  )
  setDT(enr_df)
  enr_df

})
enr_dt <- rbindlist(enr_list)

# save
fwrite(
  enr_dt,
  file.path(res_dir, "motif-enrich-archetypes-mona.tsv"),
  sep = "\t"
)

Cluster significant motifs

# enrichment scores
dt <- fread(
  file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, c("label", "motif"), c("group", "archetype_name"))
dt[, motif := str_extract(archetype_name, "ARCH\\d+")]

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]

# combine
dt <- merge.data.table(
  dt, dc, by = "motif",
  all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)

# qvalue
dat_qv <- dcast.data.table(
  unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[, -1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
  mat_qv,
  file.path(
    res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"
  ),
  sep = "\t", quote = FALSE
)

# log2fc
dat_fc <- dcast.data.table(
  unique(dt[, .(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[, -1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
  mat_fc,
  file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t", quote = FALSE
)

# highly significant higher value
dt[, minuslog10qval := -1 * log10(padj)]

# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
  apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)

# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]

# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels = rev(rownames(mat_fc[ids, ])[mord]))]

# limit -log10FDR range
ds[, minuslog10qval := pmin(minuslog10qval, 20)]

# golden marks annotations
marks_gold <- fread(
  file.path("annotation", "golden-marks-231124.tsv"),
  sep = "\t", header = FALSE, fill = TRUE
)[, c(2, 1)]
setnames(marks_gold, c("gene", "name"))

# tf annotations
tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]

# add annotations
marks_tfs <- merge.data.table(
  tfs_annt,
  marks_gold,
  by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
mt_marks_dt <- merge.data.table(
  ds, marks_tfs,
  by = "gene",
  all.x = TRUE
)
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]
setorder(mt_marks_dt, motif)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "" & og != "", name := og]

# add labels
mt_marks_dt[, label := name]
mt_marks_dt[label == "", label := archetype_name]
mt_marks_dt[!grepl("^ARCH", label), label := paste(motif, label)]
mt_marks_dt[, label := substr(label, 1, 35)]
mt_marks_dt[, label := factor(label, levels = unique(mt_marks_dt$label))]

All significant motifs enrichment heatmap

# plot
gp <- ggplot(mt_marks_dt, aes(group, label, label = label)) +
  geom_point(
    aes(size = minuslog10qval, fill = fc),
    shape = 21,
    color = "black"
  ) +
  scale_fill_gradientn(
    name = "enrichmnet\nfold change",
    # breaks = c(0, 1, 2, 4, 6),
    colours = c(
      "white", "#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15", "#7a0105"
    )
  ) +
  scale_size_continuous(
    name = "-log10 FDR",
    breaks = c(0, 10, 20)
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
    legend.direction = "vertical"
  )

Significant motifs assigned to TFs enrichment heatmap

mt_marks_tf <- mt_marks_dt[!is.na(gene)]
mt_marks_tf[, label := paste(
  substr(name, 1, 45), gene, motif, sep = " | "
)]
mt_marks_tf[, label := factor(label, levels = unique(mt_marks_tf$label))]


# plot
gp <- ggplot(mt_marks_tf, aes(group, label, label = label)) +
  geom_point(
    aes(size = minuslog10qval, fill = fc),
    shape = 21,
    color = "black"
  ) +
  scale_fill_gradientn(
    name = "enrichmnet\nfold change",
    # breaks = c(0, 1, 2, 4, 6),
    colours = c(
      "white", "#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15", "#7a0105"
    )
  ) +
  scale_size_continuous(
    name = "-log10 FDR",
    breaks = c(0, 10, 20)
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
    legend.direction = "vertical"
  )
gp

How many expressed TFs have motifs?

# TF annotation
tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]

# groupped genes
mark_dt <- fread(file.path(
  "RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path(
  "RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"
))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
  clustered = mark_dt[, .(gene, group)],
  predicted = gene_dt[, .(gene, group)]
),  idcol = "method")
group_dt[, group := factor(
  group,
  levels = paste0("G", unique(
    sort(as.integer(str_remove(group_dt$group, "G")))
  ))
)]
message(sprintf(
  "Total genes: %i", length(group_dt$gene)
))
## Total genes: 20535
# 20535

# expressed genes
con_mt <- read.table(
  file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
  header=TRUE, row.names = 1
)
expr_genes <- names(which(apply(con_mt, 1, function(x) sum(x) > 100) == TRUE))
message(sprintf(
  "Expressed genes: %i", length(expr_genes)
))
## Expressed genes: 19288
# 19288

# expressed TFs
expr_tfs <- expr_genes[expr_genes %in% tfs_annt$gene]
message(sprintf(
  "Expressed TFs: %i", length(expr_tfs)
))
## Expressed TFs: 740
# 740

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]

# expressed TFs with motif
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)
expr_dt <- merge.data.table(
  expr_dt, dc, by = "gene", all.x = TRUE, sort = FALSE
)
expr_tfs_mot <- unique(expr_dt[TF==TRUE & !is.na(archetype_name)]$gene)
message(sprintf(
  "Expressed TFs with motif assigned: %i/%i (%.2f%%)",
  length(expr_tfs_mot), length(expr_tfs),
  length(expr_tfs_mot) / length(expr_tfs) * 100
))
## Expressed TFs with motif assigned: 589/740 (79.59%)
# 589/740 (79.59%)

Footprinting

Merge replicates per condition

bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
nth=18

for line in Fox Elav Ncol
do
  for cond in pos neg
  do
    name=${line}_${cond}
    echo ${name}
    bams=$( echo ${bam_dir}/${name}*bam )
    samtools merge -@ ${nth} ${bam_dir}/${name}.ncfree.bam ${bams}
    samtools sort -@ ${nth} -o ${bam_dir}/${name}.ncfree.sorted.bam ${bam_dir}/${name}.ncfree.bam
    rm ${bam_dir}/${name}.ncfree.bam
    samtools index -@ ${nth} ${bam_dir}/${name}.ncfree.sorted.bam
  done
done

Footprint score calculation for consensus peaks

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12

for line in Fox Elav Ncol
do
  for cond in pos neg
  do
    name=${line}_${cond}
    
    echo $(date) "- Starting ATACorrect for" ${name}
    TOBIAS ATACorrect \
      --bam ${bam_dir}/${name}.ncfree.sorted.bam \
      --genome ${genome} \
      --peaks ${peaks} \
      --prefix ${name} \
      --outdir ${out_dir}/ATACorrect \
      --cores ${nth}
    
    echo $(date) "- Starting FootprintScores"
    TOBIAS FootprintScores \
      --signal ${out_dir}/ATACorrect/${name}_corrected.bw \
      --regions ${peaks} \
      --fp-min 10 --fp-max 50 \
      --output ${out_dir}/${name}_footprints.bw \
      --cores ${nth}
    
  done
done

Plot difference in footprints

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"

for line in Fox Elav Ncol
do
  TOBIAS PlotAggregate --TFBS ${peaks} \
    --signals ${bwg_dir}/${line}_pos_corrected.bw ${bwg_dir}/${line}_neg_corrected.bw \
    --output ${plt_dir}/${line}_footprint_comparison_all_peaks.pdf \
    --flank 125 \
    --share_y both \
    --plot_boundaries \
    --signal-on-x
done

TOBIAS PlotAggregate --TFBS ${peaks} \
  --signals ${bwg_dir}/Elav_pos_corrected.bw ${bwg_dir}/Fox_pos_corrected.bw ${bwg_dir}/Ncol_pos_corrected.bw \
  --output ${plt_dir}/pos_footprint_comparison_all_peaks.pdf \
  --flank 150 \
  --share_y both \
  --plot_boundaries \
  --signal-on-x

TOBIAS motif scores

Combine motif scores and footprint scores to determine motif binding.

mamba activate tobias
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms.meme"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12

# have to split motifs in multiple files because of the open file limit
#for x in {1..4}
#do
#  motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-binned/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-bin"${x}".meme"

# alternatively, motifs for TFs only
  x="tfs"
  motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-tfs.meme"

  line1=Fox
  line2=Elav
  line3=Ncol

  mkdir ${out_dir}/${x}
  echo $(date) "- Starting BINDetect for" ${x} 

  TOBIAS BINDetect \
    --motifs ${motifs} \
    --signals \
      ${bwg_dir}/${line1}_pos_footprints.bw \
      ${bwg_dir}/${line1}_neg_footprints.bw \
      ${bwg_dir}/${line2}_pos_footprints.bw \
      ${bwg_dir}/${line2}_neg_footprints.bw \
      ${bwg_dir}/${line3}_pos_footprints.bw \
      ${bwg_dir}/${line3}_neg_footprints.bw \
    --genome ${genome} \
    --peaks ${peaks} \
    --outdir ${out_dir}/${x} \
    --cond_names \
      ${line1}_pos ${line1}_neg \
      ${line2}_pos ${line2}_neg \
      ${line3}_pos ${line3}_neg \
    --cores ${nth}

#done

Combine motif binding scores

bind_dir <- file.path(dat_dir, "footprint", "BINDetect")
bind_dt <- rbindlist(lapply(c("tfs"), function(x) {
  fn <- list.files(
    file.path(bind_dir, x),
    pattern = "bindetect_results.txt", full.names = TRUE
  )
  dt <- fread(fn)
  dt[, name := NULL]
  dt[, output_prefix := NULL]
  dt[, cluster := NULL]
  dt
}))
fwrite(bind_dt, file.path(
  bind_dir, "bindetect_results.txt"
), sep = "\t")

Volcano plots

# bindetect results
bind_dt <- fread(file.path(
  bind_dir, "bindetect_results.txt"
))

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]
dc_tf <- merge.data.table(
  dc, tfs_annt, by = "gene", all.x = TRUE
)
dc_tf[gene %in% names(marks_gold_v), name := marks_gold_v[gene]]

# coparisons
comparas <- list(
  c("Fox_pos", "Fox_neg"),
  c("Elav_pos", "Elav_neg"),
  c("Ncol_pos", "Ncol_neg"),
  c("Fox_pos", "Elav_pos"),
  c("Fox_pos", "Ncol_pos"),
  c("Elav_pos", "Ncol_pos")
)

# volcano plots
gpv_list <- lapply(comparas, function(x) {
  cols <- c(
    "motif_id",
    sprintf("%s_%s_change", x[[1]], x[[2]]),
    sprintf("%s_%s_pvalue", x[[1]], x[[2]])
  )
  dt <- unique(bind_dt[, ..cols])
  setnames(dt, c("motif_name", "log2FoldChange", "pvalue"))
  dt[, motif := str_extract(motif_name, "ARCH\\d+")]
  dt <- merge.data.table(dt, dc_tf, by = "motif", all.x = TRUE)
  dt[, motif_label := name]
  dt[motif_label == "", name := pfam]
  dt[, motif_label := strtrim(motif_label, 45)]
  dt[, minuslog10pval := -1 * log10(pvalue)]
  dt[, minuslog10padj := NA]
  ggplotVolcano(
    dt,
    padj_thr = NULL, pval_thr = 0.05,
    lfc_thr = 0.2,
    sign_col = c("red", "blue"),
    lims_fc = c(-0.5, 0.5),
    lims_sig = c(0, 150),
    title = sprintf("%s vs %s", x[[1]], x[[2]]),
    xlab = "differential binding score",
    y = "-log10(p value)",
    label_column = "motif_label", label_size = 1, label_segment_alpha = 0.2,
    label_fc_thr = 0.2
  ) +
    theme(panel.grid.major = element_line(color = "gray", linewidth = 0.2))
})

gpv <- (wrap_plots(gpv_list, nrow = 2) & theme(legend.position = "bottom")) +
  plot_layout(guides = "collect")
gpv
## Warning: Removed 28 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 10 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 2 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 17 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 71 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 7 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 8 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 22 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 38 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 24 rows containing missing values (`geom_text_repel()`).

Save parsed results

# parse data
bind_dt <- rbindlist(lapply(comparas, function(x) {
  cols <- c(
    "motif_id",
    sprintf("%s_%s_change", x[[1]], x[[2]]),
    sprintf("%s_%s_pvalue", x[[1]], x[[2]])
  )
  dt <- unique(bind_dt[, ..cols])
  setnames(dt, c("motif_name", "log2FoldChange", "pvalue"))
  dt[, motif := str_extract(motif_name, "ARCH\\d+")]
  dt <- merge.data.table(dt, dc_tf, by = "motif", all.x = TRUE)
  dt[, compara := sprintf("%s_%s", x[[1]], x[[2]])]
  dt
}))
fwrite(bind_dt, file.path(
  bind_dir, "bindetect_results_parsed.txt"
), sep = "\t")
fwrite(bind_dt[pvalue < 0.05], file.path(
  bind_dir, "bindetect_results_significant.txt"
), sep = "\t")

Parse all binding scores per sample (from all comparisons)

# load diff motif binding results
hm_dt <- fread(file.path(bind_dir, "bindetect_results_parsed.txt"))
#hm_dt <- fread(file.path(bind_dir, "bindetect_results_significant.txt"))

# get motifs from all comparisons per line
diff_mots_dt <- rbindlist(sapply(c("Elav", "Fox", "Ncol"), function(smp) {
  # select motifs from comparison with neg control
  dt_crt <- hm_dt[compara == sprintf("%s_pos_%s_neg", smp, smp)]
  # select motifs from comparison with other lines
  if (smp == "Elav") {
    dt_dif <- rbindlist(list(
      "neg" = dt_crt,
      "Fox" = hm_dt[compara == "Fox_pos_Elav_pos"][
        , log2FoldChange := -1 * log2FoldChange],
      "Ncol" = hm_dt[compara == "Elav_pos_Ncol_pos"]
    ), idcol = "vs")
  } else if (smp == "Fox") {
    dt_dif <- rbindlist(list(
      "neg" = dt_crt,
      "Elav" = hm_dt[compara == "Fox_pos_Elav_pos"],
      "Ncol" = hm_dt[compara == "Fox_pos_Ncol_pos"]
    ), idcol = "vs")
  } else if (smp == "Ncol") {
    dt_dif <- rbindlist(list(
      "neg" = dt_crt,
      "Fox" = hm_dt[compara == "Fox_pos_Ncol_pos"][
        , log2FoldChange := -1 * log2FoldChange],
      "Elav" = hm_dt[compara == "Elav_pos_Ncol_pos"][
        , log2FoldChange := -1 * log2FoldChange]
    ), idcol = "vs")
  }
  dt_dif[, compara := NULL]
  setcolorder(dt_dif, c("vs", "motif", "log2FoldChange", "pvalue"))
  dt_dif
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "reporterline")

fwrite(
  diff_mots_dt,
  file.path(bind_dir, "bindetect_results_reporterline.txt"),
  sep = "\t"
)
#fwrite(
#  diff_mots_dt,
#  file.path(bind_dir, "bindetect_results_significant_reporterline.txt"),
#  sep = "\t"
#)

Plot dotplot

# BINDetect results
diff_mots_dt <- fread(file.path(
  bind_dir, "bindetect_results_reporterline.txt"
))

# significant
pval_thr <- 0.05
fc_thr <- 0.2
siginifcant_mots <- diff_mots_dt[
  pvalue < pval_thr & abs(log2FoldChange) > fc_thr
]$motif
dp_dt <- diff_mots_dt[motif %in% siginifcant_mots]
dp_dt[, id := paste(reporterline, vs, sep = " vs ")]

# order samples
dp_dt[, reporterline := factor(reporterline, levels = c("Elav", "Fox", "Ncol"))]
dp_dt[, vs := factor(vs, levels = c("neg", "Elav", "Fox", "Ncol"))]
lv_dt <- CJ(
  levels(dp_dt$reporterline),
  levels(dp_dt$vs)
)[V1 != V2][, id := paste(V1, V2, sep = " vs ")]
lv_dt[, V1 := factor(V1, levels = c("Elav", "Fox", "Ncol"))]
lv_dt[, V2 := factor(V2, levels = c("neg", "Elav", "Fox", "Ncol"))]
setorder(lv_dt, V1, V2)
dp_dt[, id := factor(id, levels = lv_dt$id)]
dp_dt[, motif_id := paste(motif, gene, sep = "__")]

# order motifs
gpv_dt <- dcast.data.table(
  dp_dt, motif_id ~ id, value.var = "log2FoldChange"
)
gpv_mt <- as.matrix(gpv_dt[, -1])
rownames(gpv_mt) <- unique(dp_dt$motif_id)
mord <- rev(order(apply(gpv_mt, 1, which.max)))
mord <- hclust(dist(gpv_mt), method = "ward.D2")$order
dp_dt[, motif_id := factor(motif_id, levels = rownames(gpv_mt)[mord])]
setorder(dp_dt, id, motif_id)

# labels
dp_dt[, motif_label := paste(substr(name, 1, 35), gene, motif, sep = " | ")]
dp_dt[, motif_label := factor(motif_label, levels = unique(dp_dt$motif_label))]

# plot
dp_dt[, minuslog10pvalue := -1 * log10(pvalue)]
gp_dot <- ggplot(dp_dt, aes(
  id, motif_label,
  size = minuslog10pvalue, fill = log2FoldChange
)) +
  geom_point(shape = 21) +
  scale_fill_gradientn(
    colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
    limits = c(-0.35, 0.35), oob = scales::squish, breaks = c(-0.3, 0, 0.3),
    name = "log2(fold change)",
  ) +
  scale_size_continuous(
    name = "-log10(q value)",
    range = c(1, 7),
    breaks = c(10, 50, 100, 150)
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.title.x = element_blank(),
    legend.direction = "vertical"
  ) +
  facet_grid(. ~ reporterline, scales = "free")

Networks

Parse bound files for all motifs

bound_dt <- rbindlist(lapply("tfs", function(x) {
  message(x)
  mot_dirs <- list.dirs(file.path(bind_dir, x), recursive = FALSE)
  mot_dirs <- grep("ARCH", mot_dirs, value = TRUE)
  mot <- str_remove(basename(mot_dirs), "^_")
  names(mot_dirs) <- mot
  rbindlist(sapply(mot, function(m) {
    message(m)
    fns <- list.files(
      file.path(mot_dirs[m], "beds"),
      pattern = "_bound.bed",
      full.names = TRUE
    )
    names(fns) <- str_extract(basename(fns), "(Elav|Fox|Ncol)_(pos|neg)")
    rbindlist(sapply(names(fns), function(fn) {
      fread(fns[fn])
    }, USE.NAMES = TRUE, simplify = FALSE), idcol = "sample")
  }, simplify = FALSE, USE.NAMES = TRUE))
}))
bound_dt[, V11 := V13][, V13 := NULL]
bed_cols <- c(
  "seqnames", "start", "end", "motif_name", "score", "strand",
  "seqnames_peak", "start_peak", "end_peak",
  "peak", "peak_score", "peak_strand"
)
setnames(bound_dt, c("sample", bed_cols))
bound_dt[, motif_name := str_remove(motif_name, "^_")]

# add motif-to-gene assignments
bound_dt[, motif := str_extract(motif_name, "ARCH\\d+")]
bound_dt <- merge.data.table(
  bound_dt, dc_tf, by = "motif",
  all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)

# save
fwrite(
  bound_dt,
  file.path(bind_dir, "motifs_bound.tsv.gz"),
  sep = "\t"
)

Subset targets for expressed genes.

# load motif binding in peaks
bound_dt <- fread(file.path(bind_dir, "motifs_bound.tsv.gz"))

# order samples
bound_dt[, sample := factor(sample, levels = c(
  "Elav_pos", "Elav_neg",
  "Fox_pos", "Fox_neg",
  "Ncol_pos", "Ncol_neg"
))]

# add peak to target gene assignment
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
assign <- unique(assign[, .(gene, peak)])
setnames(assign, "gene", "target_gene")
bound_dt <- merge.data.table(
  bound_dt, assign, by = "peak",
  all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)
bound_dt <- bound_dt[!is.na(target_gene)]

# parse only motif + gene + target gene + sample info
mo_dt <- bound_dt[order(score)][, .SD[1], .(motif, gene, target_gene, sample)]
mo_dt <- unique(mo_dt[, .(
  motif, motif_name, gene, name, pfam, target_gene, sample
)])
mo_dt[, n_samples := length(unique(.SD$sample)), .(motif, gene, target_gene)]
setorder(mo_dt, -n_samples, gene, sample)
mo_dt[, sample := factor(sample, levels = c(
  "Ncol_neg", "Fox_neg", "Elav_neg",
  "Ncol_pos", "Fox_pos", "Elav_pos"
))]

# add target gene annotation
target_gene_ann <- copy(gene_ann)
setnames(
  target_gene_ann,
  colnames(target_gene_ann),
  paste0("target_", colnames(target_gene_ann))
)
mo_dt <- merge.data.table(
  mo_dt, target_gene_ann, by = "target_gene",
  all.x = TRUE, sort = FALSE
)
mo_dt[is.na(target_pfam), target_pfam := ""]
mo_dt[is.na(target_name), target_name := ""]

# gene expression
tpm_mt <- read.table(
  file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
  header = TRUE, row.names = 1
)

# aggregate replicates
smpl <- paste(
  str_extract(colnames(tpm_mt), "Fox|Elav|Ncol"),
  str_to_lower(str_extract(colnames(tpm_mt), "Pos|Neg")),
  sep = "_"
)
smpl_mt <- tapply(colnames(tpm_mt), smpl, function(y) {
  apply(tpm_mt[, y], 1, mean)
}, simplify = FALSE)
smpl_mt <- do.call("cbind", smpl_mt)

# expressed genes
tpm_genes_dt <- rbindlist(apply(smpl_mt, 2, function(x) {
  data.table(tpm = x)[, gene := names(x)]
}, simplify = FALSE), idcol = "sample")
mo_dt <- merge.data.table(
  mo_dt, tpm_genes_dt,
  by = c("gene", "sample"),
  all.x = TRUE, sort = FALSE
)
mo_dt[, expressed := FALSE][tpm > 100, expressed := TRUE]
unique(
  mo_dt[, .(sample, gene, expressed)]
)[, .N, .(sample, expressed)][order(sample, expressed)]
##       sample expressed   N
##  1: Elav_neg     FALSE  81
##  2: Elav_neg      TRUE 517
##  3: Elav_pos     FALSE 119
##  4: Elav_pos      TRUE 479
##  5:  Fox_neg     FALSE  69
##  6:  Fox_neg      TRUE 529
##  7:  Fox_pos     FALSE 123
##  8:  Fox_pos      TRUE 475
##  9: Ncol_neg     FALSE  88
## 10: Ncol_neg      TRUE 510
## 11: Ncol_pos     FALSE 138
## 12: Ncol_pos      TRUE 460
# expressed target genes
tmp_tgenes_dt <- copy(tpm_genes_dt)
setnames(tmp_tgenes_dt, c("gene", "tpm"), c("target_gene", "target_tpm"))
mo_dt <- merge.data.table(
  mo_dt, tmp_tgenes_dt,
  by = c("target_gene", "sample"),
  all.x = TRUE, sort = FALSE
)
mo_dt[, target_expressed := FALSE][target_tpm > 100, target_expressed := TRUE]
unique(
  mo_dt[, .(sample, target_gene, target_expressed)]
)[, .N, .(sample, target_expressed)][order(sample, target_expressed)]
##       sample target_expressed    N
##  1: Elav_neg            FALSE  669
##  2: Elav_neg             TRUE 8377
##  3: Elav_pos            FALSE  682
##  4: Elav_pos             TRUE 7994
##  5:  Fox_neg            FALSE  658
##  6:  Fox_neg             TRUE 8540
##  7:  Fox_pos            FALSE  786
##  8:  Fox_pos             TRUE 7921
##  9: Ncol_neg            FALSE  688
## 10: Ncol_neg             TRUE 8504
## 11: Ncol_pos            FALSE  733
## 12: Ncol_pos             TRUE 7778
# save
setcolorder(mo_dt, c(
  "sample", "gene", "target_gene", "name", "pfam", "target_name", "target_pfam",
  "tpm", "expressed", "target_tpm", "target_expressed", "motif", "motif_name",
  "n_samples"
))
fwrite(
  mo_dt,
  file.path(res_dir, "motifs_genes_targets_long.tsv.gz"),
  sep = "\t"
)

How many target genes for each TF?

hits_dt <- unique(mo_dt[expressed == TRUE][, .(sample, motif, gene, target_gene)])
hits_dt <- hits_dt[, .N, .(motif, sample)]

hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
  geom_boxplot() +
  scale_y_continuous(trans = "log10") +
  scale_fill_manual(values = line_cond_cols) +
  labs(y = "number of target genes per TF") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    panel.grid.major.y = element_line(linewidth = 0.5),
    panel.grid.minor.y = element_line(linewidth = 0.2)
  )
hits_gp